Example 1

Чтение JSON из файла.


In [5]:
path = './data/usagov_bitly_data2013-05-17-1368832207'

In [6]:
# Peek at the first raw line of the file. The context manager closes the
# file handle immediately instead of leaving it to the garbage collector.
with open(path) as raw_file:
    first_line = raw_file.readline()
first_line


Out[6]:
'{ "a": "Mozilla\\/5.0 (Linux; U; Android 4.1.2; en-us; HTC_PN071 Build\\/JZO54K) AppleWebKit\\/534.30 (KHTML, like Gecko) Version\\/4.0 Mobile Safari\\/534.30", "c": "US", "nk": 0, "tz": "America\\/Los_Angeles", "gr": "CA", "g": "15r91", "h": "10OBm3W", "l": "pontifier", "al": "en-US", "hh": "j.mp", "r": "direct", "u": "http:\\/\\/www.nsa.gov\\/", "t": 1368832205, "hc": 1365701422, "cy": "Anaheim", "ll": [ 33.816101, -117.979401 ] }\n'

In [9]:
import json

In [10]:
# Each line of the file is a standalone JSON document. Reading inside `with`
# guarantees the file is closed once every line has been parsed.
with open(path) as raw_file:
    records = [json.loads(line) for line in raw_file]

In [11]:
records[0]


Out[11]:
{u'a': u'Mozilla/5.0 (Linux; U; Android 4.1.2; en-us; HTC_PN071 Build/JZO54K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
 u'al': u'en-US',
 u'c': u'US',
 u'cy': u'Anaheim',
 u'g': u'15r91',
 u'gr': u'CA',
 u'h': u'10OBm3W',
 u'hc': 1365701422,
 u'hh': u'j.mp',
 u'l': u'pontifier',
 u'll': [33.816101, -117.979401],
 u'nk': 0,
 u'r': u'direct',
 u't': 1368832205,
 u'tz': u'America/Los_Angeles',
 u'u': u'http://www.nsa.gov/'}

In [13]:
records[0]['tz']


Out[13]:
u'America/Los_Angeles'

Подсчет часовых поясов на чистом Python


In [45]:
time_zones = [rec['tz'] for rec in records if 'tz' in rec]

In [52]:
time_zones[:10]


Out[52]:
[u'America/Los_Angeles',
 u'',
 u'America/Phoenix',
 u'America/Chicago',
 u'',
 u'America/Indianapolis',
 u'America/Chicago',
 u'',
 u'Australia/NSW',
 u'']

1 способ подсчета


In [53]:
def get_counts(sequence):
    """Tally how many times each item occurs in ``sequence``.

    Returns a plain ``dict`` mapping every distinct item to the number of
    times it was seen.
    """
    tally = {}
    for item in sequence:
        # Unseen items start from 0, so the first sighting stores 1.
        tally[item] = tally.get(item, 0) + 1
    return tally

2 способ подсчета


In [54]:
from collections import defaultdict

def get_counts2(sequence):
    """Tally the items of ``sequence`` using a ``defaultdict``.

    Returns a ``defaultdict(int)`` mapping every distinct item to the number
    of times it was seen; looking up an unseen key yields 0.
    """
    tally = defaultdict(int)  # missing keys start at 0
    for item in sequence:
        tally[item] = tally[item] + 1
    return tally

Пример использования


In [55]:
counts = get_counts(time_zones)

In [56]:
counts['America/Los_Angeles']


Out[56]:
421

Общее число записей и 10 самых частых часовых поясов


In [57]:
len(time_zones)


Out[57]:
3839

In [58]:
def top_counts(count_dict, n=10):
    """Return the ``n`` entries of ``count_dict`` with the highest counts.

    Parameters
    ----------
    count_dict : dict
        Mapping of key -> count.
    n : int, optional
        Maximum number of entries to return (default 10).

    Returns
    -------
    list of (count, key) tuples
        Sorted in ascending order, so the most frequent entry comes last.
        Empty when ``n`` is zero or negative.
    """
    # Guard first: ``pairs[-0:]`` is ``pairs[0:]``, so without this check
    # ``n=0`` would return every entry instead of none.
    if n <= 0:
        return []
    value_key_pairs = sorted((count, key) for key, count in count_dict.items())
    return value_key_pairs[-n:]

In [59]:
top_counts(counts)


Out[59]:
[(40, u'America/Phoenix'),
 (50, u'America/Indianapolis'),
 (85, u'Europe/London'),
 (89, u'America/Denver'),
 (102, u'Asia/Tokyo'),
 (184, u'America/Puerto_Rico'),
 (421, u'America/Los_Angeles'),
 (636, u''),
 (686, u'America/Chicago'),
 (903, u'America/New_York')]

In [60]:
from collections import Counter

In [61]:
counts = Counter(time_zones)

In [62]:
counts.most_common(10)


Out[62]:
[(u'America/New_York', 903),
 (u'America/Chicago', 686),
 (u'', 636),
 (u'America/Los_Angeles', 421),
 (u'America/Puerto_Rico', 184),
 (u'Asia/Tokyo', 102),
 (u'America/Denver', 89),
 (u'Europe/London', 85),
 (u'America/Indianapolis', 50),
 (u'America/Phoenix', 40)]

Подсчет часовых поясов с помощью pandas


In [64]:
from pandas import DataFrame, Series

In [66]:
import pandas as pd

In [67]:
frame = DataFrame(records)

In [68]:
frame


Out[68]:
_heartbeat_ a al c cy g gr h hc hh kw l ll nk r t tz u
0 NaN Mozilla/5.0 (Linux; U; Android 4.1.2; en-us; H... en-US US Anaheim 15r91 CA 10OBm3W 1.365701e+09 j.mp NaN pontifier [33.816101, -117.979401] 0.0 direct 1.368832e+09 America/Los_Angeles http://www.nsa.gov/
1 NaN Mozilla/4.0 (compatible; MSIE 7.0; Windows NT ... en-us None NaN ifIpBW NaN ifIpBW 1.302189e+09 1.usa.gov NaN bitly NaN 0.0 http://www.usa.gov/ 1.368832e+09 http://answers.usa.gov/system/selfservice.cont...
2 NaN Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20... en-US,en;q=0.5 US Fort Huachuca 10DaxOu AZ 10DaxOt 1.368815e+09 1.usa.gov NaN jaxstrong [31.5273, -110.360703] 1.0 http://www.facebook.com/l.php?u=http%3A%2F%2F1... 1.368832e+09 America/Phoenix http://www.saj.usace.army.mil/Media/NewsReleas...
3 NaN Mozilla/5.0 (Linux; U; Android 4.1.2; en-us; S... en-US US Houston TysVFU TX TChsoQ 1.354719e+09 1.usa.gov NaN o_5004fs3lvd [29.7633, -95.363297] 1.0 http://m.facebook.com/l.php?u=http%3A%2F%2F1.u... 1.368832e+09 America/Chicago https://nationalregistry.fmcsa.dot.gov/
4 NaN Opera/9.80 (Android; Opera Mini/7.5.33286/29.3... en None NaN 10IGW7m NaN 10IGW7l 1.368738e+09 1.usa.gov NaN peacecorps NaN 0.0 http://t.co/CDO9hLTtNT 1.368832e+09 http://www.peacecorps.gov/learn/howvol/ab530gr...
5 NaN Mozilla/5.0 (compatible; MSIE 10.0; Windows NT... en-US US Mishawaka 13GrCeP IN 13GrCeP 1.368131e+09 1.usa.gov NaN bitly [41.612301, -86.1381] 0.0 direct 1.368832e+09 America/Indianapolis https://petitions.whitehouse.gov/petition/repe...
6 NaN Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) G... en-US,en;q=0.5 US Hammond YmtpnZ WI YmtpnZ 1.363712e+09 1.usa.gov NaN bitly [45.007, -92.459099] 1.0 http://www.bwsd.k12.wi.us/SitePages/Home.aspx 1.368832e+09 America/Chicago http://pld.dpi.wi.gov/files/pld/images/LinkWI.png
7 NaN Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_5 li... en-us None NaN 13oM0hV NaN 15PUeH0 1.368714e+09 go.nasa.gov NaN nasatwitter NaN 0.0 http://t.co/YIsVhFDLj2 1.368832e+09 http://www.nasa.gov/multimedia/imagegallery/im...
8 NaN Mozilla/5.0 (iPhone; CPU iPhone OS 6_1_3 like ... en-us AU Sydney 15r91 02 10OBm3W 1.365701e+09 j.mp NaN pontifier [-33.8615, 151.205505] 0.0 direct 1.368832e+09 Australia/NSW http://www.nsa.gov/
9 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... en-US,en;q=0.8 None NaN 109LtDc NaN 109LtDb 1.368822e+09 go.nasa.gov NaN nasatwitter NaN 0.0 http://t.co/yPSKO2t5v1 1.368832e+09 http://www.nasa.gov/mission_pages/sunearth/new...
10 NaN Mozilla/5.0 (iPhone; CPU iPhone OS 6_1_3 like ... en-us US Middletown 109ar5F OH 109ar5E 1.368804e+09 1.usa.gov NaN usairforce [39.515099, -84.3983] 1.0 https://m.facebook.com 1.368832e+09 America/New_York http://www.dodlive.mil/index.php/2013/05/the-2...
11 NaN Mozilla/5.0 (iPhone; CPU iPhone OS 6_1_3 like ... en-us US Germantown 107xZnW MD 107xZnW 1.368815e+09 1.usa.gov NaN bitly [39.131699, -77.288002] 0.0 http://t.co/u8qVCKx8RK 1.368832e+09 America/New_York http://doggett.house.gov/index.php/news/571-do...
12 NaN Mozilla/5.0 (iPad; CPU OS 6_1_2 like Mac OS X)... en-us US Richmond 19AcekS KY 19AcekR 1.368738e+09 1.usa.gov NaN peacecorps [37.766602, -84.303101] 1.0 http://www.facebook.com/l.php?u=http%3A%2F%2F1... 1.368832e+09 America/New_York http://www.peacecorps.gov/learn/howvol/ab530gr...
13 NaN Mozilla/5.0 (Windows NT 5.1; rv:20.0) Gecko/20... en-US,en;q=0.5 US Portland 16mY628 OR 16mY627 1.368744e+09 1.usa.gov NaN pbierce [45.529499, -122.643204] 1.0 http://t.co/T8EyBbUBJ8 1.368832e+09 America/Los_Angeles http://www.fws.gov/cno/press/release.cfm?rid=493
14 NaN Mozilla/4.0 (compatible; MSIE 7.0; Windows NT ... en-us US Aurora YRyW8K IL YRyW8K 1.368476e+09 1.usa.gov NaN bitly [41.760601, -88.320099] 0.0 http://www.z2systems.com/np/clients/kca/news.j... 1.368832e+09 America/Chicago http://www.cancer.gov/PublishedContent/Images/...
15 NaN Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; r... en-US,en;q=0.5 US Houston 18NUp44 TX 18NUoNR 1.368727e+09 1.usa.gov NaN o_1fs5ea3lim [29.7633, -95.363297] 0.0 http://t.co/s307mx2qGk 1.368832e+09 America/Chicago http://www.army.mil/article/103380/
16 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... en-US,en;q=0.8 US Muskego YmtpnZ WI YmtpnZ 1.363712e+09 1.usa.gov NaN bitly [42.877602, -88.133797] 0.0 http://www.cudahy.k12.wi.us/ 1.368832e+09 America/Chicago http://pld.dpi.wi.gov/files/pld/images/LinkWI.png
17 NaN Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)... en-us US Arvada ZPictr CO ZPictq 1.366901e+09 1.usa.gov NaN o_d63rn9enb [39.802799, -105.087502] 1.0 direct 1.368832e+09 America/Denver http://www.nws.noaa.gov/com/weatherreadynation...
18 NaN Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3)... en-us US Bend 11C6yJk OR 19oVtZN 1.368558e+09 1.usa.gov NaN raylahood [44.074402, -121.257401] 1.0 direct 1.368832e+09 America/Los_Angeles http://fastlane.dot.gov/2013/05/new-locomotive...
19 NaN Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) G... en-US,en;q=0.5 US Laurel 15RP5hF MD 16Ewvc4 1.368818e+09 1.usa.gov NaN rebroth [39.135799, -76.872002] 0.0 http://t.co/Dv6Jqbwu8H 1.368832e+09 America/New_York http://apod.nasa.gov/apod/ap130517.html
20 NaN Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ... en-us US Seattle 12yP2Cx WA 12yP2Cw 1.368742e+09 1.usa.gov NaN o_6vo5h05abv [47.606201, -122.3321] 1.0 http://t.co/7K9urpYyc6 1.368832e+09 America/Los_Angeles http://www.ice.gov/news/releases/1305/130516sa...
21 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... en-US,en;q=0.8 None NaN 109ar5F NaN 109ar5E 1.368804e+09 1.usa.gov NaN usairforce NaN 0.0 https://www.facebook.com/ 1.368832e+09 http://www.dodlive.mil/index.php/2013/05/the-2...
22 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... en-US,en;q=0.8 US Durand YmtpnZ WI YmtpnZ 1.363712e+09 1.usa.gov NaN bitly [44.590698, -91.891197] 0.0 http://www.alma.k12.wi.us/ 1.368832e+09 America/Chicago http://pld.dpi.wi.gov/files/pld/images/LinkWI.png
23 NaN Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; L... en-us,en;q=0.9 None NaN 107xZnW NaN 107xZnW 1.368815e+09 1.usa.gov NaN bitly NaN 0.0 http://m.facebook.com/l.php?u=http%3A%2F%2F1.u... 1.368832e+09 http://doggett.house.gov/index.php/news/571-do...
24 NaN Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3... NaN None NaN YmtpnZ NaN YmtpnZ 1.363712e+09 1.usa.gov NaN bitly NaN 0.0 http://www.wabeno.k12.wi.us/ 1.368832e+09 http://pld.dpi.wi.gov/files/pld/images/LinkWI.png
25 NaN Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... en-us,en;q=0.5 None NaN ifIpBW NaN ifIpBW 1.302189e+09 1.usa.gov NaN bitly NaN 0.0 http://addthis.com/hemmings 1.368832e+09 http://answers.usa.gov/system/selfservice.cont...
26 NaN Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... en-us,en;q=0.5 None NaN ifIpBW NaN ifIpBW 1.302189e+09 1.usa.gov NaN bitly NaN 0.0 http://addthis.com/hemmings 1.368832e+09 http://answers.usa.gov/system/selfservice.cont...
27 NaN Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... en-us,en;q=0.5 None NaN ifIpBW NaN ifIpBW 1.302189e+09 1.usa.gov NaN bitly NaN 0.0 http://addthis.com/hemmings 1.368832e+09 http://answers.usa.gov/system/selfservice.cont...
28 NaN Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... en-us,en;q=0.5 None NaN ifIpBW NaN ifIpBW 1.302189e+09 1.usa.gov NaN bitly NaN 0.0 http://addthis.com/hemmings 1.368832e+09 http://answers.usa.gov/system/selfservice.cont...
29 NaN Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... en-us,en;q=0.5 None NaN ifIpBW NaN ifIpBW 1.302189e+09 1.usa.gov NaN bitly NaN 0.0 http://addthis.com/hemmings 1.368832e+09 http://answers.usa.gov/system/selfservice.cont...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3929 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... ja,en-US;q=0.8,en;q=0.6 None NaN 10Kc32m NaN 10Kc32l 1.368809e+09 go.nasa.gov NaN nasatwitter NaN 1.0 http://t.co/HgiLLFRDtE 1.368836e+09 http://www.nasa.gov/mission_pages/station/expe...
3930 NaN Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; r... fr,fr-fr;q=0.8,en-us;q=0.5,en;q=0.3 CH Chambesy 14bmsHn 07 14bmsHn 1.368224e+09 1.usa.gov NaN bitly [46.242401, 6.1435] 0.0 direct 1.368836e+09 Europe/Zurich http://gsaauctions.gov/gsaauctions/aucdsclnk?s...
3931 NaN Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3)... en-us US Rockville 14bmsHn MD 14bmsHn 1.368224e+09 1.usa.gov NaN bitly [39.089199, -77.183502] 1.0 direct 1.368836e+09 America/New_York http://gsaauctions.gov/gsaauctions/aucdsclnk?s...
3932 NaN Mozilla/5.0 (Windows NT 6.0) AppleWebKit/537.3... en-US,en;q=0.8 None NaN 15r91 NaN 10OBm3W 1.365701e+09 j.mp NaN pontifier NaN 0.0 direct 1.368836e+09 http://www.nsa.gov/
3933 NaN Mozilla/5.0 (iPhone; CPU iPhone OS 6_1_3 like ... en-us US Ann Arbor 1084Psg MI 1084Psg 1.368756e+09 j.mp NaN bitly [42.216702, -83.740601] 1.0 http://t.co/orOTdRX5aF 1.368836e+09 America/New_York http://science.nasa.gov/science-news/science-a...
3934 NaN Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) G... en-US,en;q=0.5 US Thomasville 15r91 NC 10OBm3W 1.365701e+09 j.mp NaN pontifier [35.882599, -80.082001] 0.0 direct 1.368836e+09 America/New_York http://www.nsa.gov/
3935 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... en-US,en;q=0.8 US Coronado Ntl Forest 186NWQK AZ 186NWQK 1.368829e+09 1.usa.gov NaN bitly [31.9582, -110.693001] 0.0 https://www.facebook.com/ 1.368836e+09 America/Phoenix http://cms3.tucsonaz.gov/files/police/media-re...
3936 NaN Mozilla/5.0 (compatible; Genieo/1.0 http://www... NaN US Manhattan Beach YYv1XQ CA YYv1XQ 1.368711e+09 1.usa.gov NaN bitly [33.889301, -118.401001] 0.0 direct 1.368836e+09 America/Los_Angeles http://www.irs.gov/uac/Newsroom/Tax-Relief-for...
3937 NaN Mozilla/5.0 (Linux; U; Android 4.0.3; en-gb; H... en-GB, en-US None NaN 12AyUk2 NaN 12AyUk1 1.368808e+09 go.nasa.gov NaN nasatwitter NaN 1.0 direct 1.368836e+09 http://www.jpl.nasa.gov/news/news.php?release=...
3938 NaN Mozilla/5.0 (Linux; U; Android 4.1.2; es-es; G... es-ES, en-US None NaN 12AyUk2 NaN 12AyUk1 1.368808e+09 go.nasa.gov NaN nasatwitter NaN 1.0 http://t.co/q6402O6lFC 1.368836e+09 http://www.jpl.nasa.gov/news/news.php?release=...
3939 NaN ShortLinkTranslate NaN JP Kashiwa YPnFn4 04 YPnFn3 1.368833e+09 1.usa.gov NaN hayano [35.854401, 139.968903] 0.0 direct 1.368836e+09 Asia/Tokyo http://www.doe.gov/articles/energy-department-...
3940 NaN Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; L... en-US None NaN 15r91 NaN 10OBm3W 1.365701e+09 j.mp NaN pontifier NaN 0.0 direct 1.368836e+09 http://www.nsa.gov/
3941 NaN Mozilla/5.0 (iPad; CPU OS 6_1_3 like Mac OS X)... en-us US Marshfield YmtpnZ WI YmtpnZ 1.363712e+09 1.usa.gov NaN bitly [44.6688, -90.171799] 0.0 http://www.colby.k12.wi.us/ 1.368836e+09 America/Chicago http://pld.dpi.wi.gov/files/pld/images/LinkWI.png
3942 NaN Mozilla/5.0 (iPad; CPU OS 6_1_3 like Mac OS X)... en-US,en;q=0.8 US Vaughn 16uqtLe WA 16uqtLd 1.368455e+09 1.usa.gov NaN o_33avl0ri1b [47.314499, -122.778503] 0.0 http://fwp.mt.gov/ 1.368836e+09 America/Los_Angeles http://fwp.mt.gov/hunting/hunterAccess/openFie...
3943 NaN Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ... en-US US Orlando 10WMBv9 FL 10WMBv9 1.368827e+09 1.usa.gov NaN bitly [28.3899, -81.4366] 0.0 http://www.elnuevodia.com/brillanteexplosionen... 1.368836e+09 America/New_York http://science.nasa.gov/media/medialibrary/201...
3944 NaN Mozilla/5.0 (Linux; U; Android 2.3.4; en-us; D... en-US US Lakewood 14bmsHn OH 14bmsHn 1.368224e+09 1.usa.gov NaN bitly [41.481701, -81.802399] 0.0 direct 1.368836e+09 America/New_York http://gsaauctions.gov/gsaauctions/aucdsclnk?s...
3945 NaN Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) G... en-US,en;q=0.5 US Boone 10X5IW8 NC 10X5IW7 1.368835e+09 1.usa.gov NaN inws [36.219101, -81.656303] 0.0 http://www.facebook.com/l.php?u=http%3A%2F%2F1... 1.368836e+09 America/New_York http://inws.wrh.noaa.gov/weather/alertinfo/103...
3946 NaN Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3... en-US,en;q=0.8 None NaN YmtpnZ NaN YmtpnZ 1.363712e+09 1.usa.gov NaN bitly NaN 1.0 http://www.cudahy.k12.wi.us/ 1.368836e+09 http://pld.dpi.wi.gov/files/pld/images/LinkWI.png
3947 NaN Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3)... en-US,en;q=0.8 US Logan 107xZnW UT 107xZnW 1.368815e+09 1.usa.gov NaN bitly [41.641201, -111.896599] 0.0 https://www.facebook.com/ 1.368836e+09 America/Denver http://doggett.house.gov/index.php/news/571-do...
3948 NaN Mozilla/5.0 (Linux; Android 4.0.4; SO-03D Buil... ja,en-US;q=0.8,en;q=0.6 JP Tokyo 15TFyGK 40 15TFyGJ 1.368810e+09 go.nasa.gov NaN nasatwitter [35.685001, 139.751404] 1.0 direct 1.368836e+09 Asia/Tokyo http://www.nasa.gov/mission_pages/mer/news/mer...
3949 NaN Mozilla/5.0 (Windows NT 6.1; WOW64; rv:17.0) G... en-gb,en;q=0.5 US Castro Valley 11C6yJk CA 19oVtZN 1.368558e+09 1.usa.gov NaN raylahood [37.709, -122.088501] 0.0 http://www.cahighspeedrail.ca.gov/ 1.368836e+09 America/Los_Angeles http://fastlane.dot.gov/2013/05/new-locomotive...
3950 NaN Mozilla/5.0 (iPhone; CPU iPhone OS 6_1_4 like ... en-us US Fayetteville 10ydrrV GA 10ydrrU 1.368806e+09 1.usa.gov NaN fsanewmedia [33.481098, -84.479797] 0.0 http://t.co/psBn8njvIB 1.368836e+09 America/New_York http://studentaid.ed.gov/repay-loans/understan...
3951 NaN Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)... en-us US Salem 107xZnW VA 107xZnW 1.368815e+09 1.usa.gov NaN bitly [37.2906, -80.101402] 0.0 http://www.facebook.com/l.php?u=http%3A%2F%2F1... 1.368836e+09 America/New_York http://doggett.house.gov/index.php/news/571-do...
3952 NaN Mozilla/5.0 (iPad; CPU OS 6_1_3 like Mac OS X)... en-us US Grayson 107xZnW KY 107xZnW 1.368815e+09 1.usa.gov NaN bitly [38.336399, -82.992401] 0.0 http://www.facebook.com/l.php?u=http%3A%2F%2F1... 1.368836e+09 America/New_York http://doggett.house.gov/index.php/news/571-do...
3953 1.368836e+09 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3954 NaN Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; M... en-US US Mobile 10WWSaR AL 10WWSaQ 1.368831e+09 1.usa.gov NaN inws [30.657499, -88.1586] 1.0 http://m.facebook.com/l.php?u=http%3A%2F%2F1.u... 1.368836e+09 America/Chicago http://inws.wrh.noaa.gov/weather/alertinfo/103...
3955 NaN Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3)... en-us US Brookfield YmtpnZ WI YmtpnZ 1.363712e+09 1.usa.gov NaN bitly [43.060799, -88.1558] 0.0 http://www.cudahy.k12.wi.us/ 1.368836e+09 America/Chicago http://pld.dpi.wi.gov/files/pld/images/LinkWI.png
3956 NaN ShortLinkTranslate NaN JP Tsukuba YPnFn4 14 YPnFn3 1.368833e+09 1.usa.gov NaN hayano [36.083302, 140.116699] 0.0 direct 1.368836e+09 Asia/Tokyo http://www.doe.gov/articles/energy-department-...
3957 NaN Mozilla/5.0 (iPhone; CPU iPhone OS 6_1_4 like ... en-us None NaN 17B6VoC NaN 16n91ZK 1.368747e+09 go.nasa.gov NaN nasatwitter NaN 0.0 http://t.co/XLS75r3BCB 1.368836e+09 http://www.jpl.nasa.gov/news/news.php?release=...
3958 NaN Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)... en-us PR Guaynabo 10WMBv9 00 10WMBv9 1.368827e+09 1.usa.gov NaN bitly [18.3876, -66.110802] 1.0 http://www.elnuevodia.com/brillanteexplosionen... 1.368836e+09 America/Puerto_Rico http://science.nasa.gov/media/medialibrary/201...

3959 rows × 18 columns


In [70]:
frame['tz'][:10]


Out[70]:
0     America/Los_Angeles
1                        
2         America/Phoenix
3         America/Chicago
4                        
5    America/Indianapolis
6         America/Chicago
7                        
8           Australia/NSW
9                        
Name: tz, dtype: object

In [71]:
tz_counts = frame['tz'].value_counts()

In [72]:
tz_counts[:10]


Out[72]:
America/New_York        903
America/Chicago         686
                        636
America/Los_Angeles     421
America/Puerto_Rico     184
Asia/Tokyo              102
America/Denver           89
Europe/London            85
America/Indianapolis     50
America/Phoenix          40
Name: tz, dtype: int64

Строим график


In [112]:
%matplotlib notebook
import matplotlib
import matplotlib.pyplot as plt

In [73]:
clean_tz = frame['tz'].fillna('Missing')

In [74]:
# Relabel blank time-zone strings so they show up under a readable name,
# distinct from the NaN entries that were filled with 'Missing'.
clean_tz.loc[clean_tz == ''] = 'Unknown'

In [75]:
tz_counts = clean_tz.value_counts()

In [76]:
tz_counts[:10]


Out[76]:
America/New_York        903
America/Chicago         686
Unknown                 636
America/Los_Angeles     421
America/Puerto_Rico     184
Missing                 120
Asia/Tokyo              102
America/Denver           89
Europe/London            85
America/Indianapolis     50
Name: tz, dtype: int64

In [113]:
tz_counts[:10].plot(kind='barh', rot=0)


Out[113]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f8ee8f3d610>

In [78]:
frame['a'][1]


Out[78]:
u'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; MDDR)'

In [79]:
frame['a'][50]


Out[79]:
u'Mozilla/5.0 (iPhone; CPU iPhone OS 6_1_3 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Mobile/10B329'

In [80]:
frame['a'][51]


Out[80]:
u'Mozilla/5.0 (iPhone; CPU iPhone OS 6_1_3 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Mobile/10B329'

Получаем информацию о браузере


In [83]:
results = Series([x.split()[0] for x in frame.a.dropna()])

In [84]:
results[:5]


Out[84]:
0    Mozilla/5.0
1    Mozilla/4.0
2    Mozilla/5.0
3    Mozilla/5.0
4     Opera/9.80
dtype: object

In [85]:
results.value_counts()[:8]


Out[85]:
Mozilla/5.0           3251
Mozilla/4.0            322
CakePHP                 38
ShortLinkTranslate      36
TVersity                30
Opera/9.80              28
Dalvik/1.6.0            19
Xenu                    15
dtype: int64

Подсчет пользователей Windows и других ОС в 10 самых частых часовых поясах


In [91]:
import numpy as np

In [87]:
cframe = frame[frame.a.notnull()]

In [92]:
operating_system = np.where(cframe['a'].str.contains('Windows'), 'Windows', 'Not Windows')

In [93]:
operating_system[:5]


Out[93]:
array(['Not Windows', 'Windows', 'Windows', 'Not Windows', 'Not Windows'], 
      dtype='|S11')

In [95]:
by_tz_os = cframe.groupby(['tz', operating_system])

In [96]:
agg_counts = by_tz_os.size().unstack().fillna(0)

In [97]:
agg_counts[:10]


Out[97]:
Not Windows Windows
tz
484.0 152.0
Africa/Cairo 0.0 3.0
Africa/Casablanca 0.0 1.0
Africa/Ceuta 4.0 2.0
Africa/Gaborone 0.0 1.0
Africa/Johannesburg 2.0 0.0
America/Anchorage 5.0 3.0
America/Argentina/Buenos_Aires 4.0 7.0
America/Argentina/Catamarca 1.0 0.0
America/Argentina/Cordoba 0.0 2.0

In [100]:
# Row positions that order the per-time-zone totals ascending; used to sort
# agg_counts by total count.
indexer = agg_counts.sum(axis=1).argsort()

In [101]:
indexer[:10]


Out[101]:
tz
                                   55
Africa/Cairo                      101
Africa/Casablanca                 100
Africa/Ceuta                       36
Africa/Gaborone                    97
Africa/Johannesburg                42
America/Anchorage                  43
America/Argentina/Buenos_Aires     44
America/Argentina/Catamarca        47
America/Argentina/Cordoba          50
dtype: int64

In [102]:
count_subset = agg_counts.take(indexer)[-10:]

In [103]:
count_subset


Out[103]:
Not Windows Windows
tz
America/Phoenix 22.0 18.0
America/Indianapolis 29.0 21.0
Europe/London 62.0 23.0
America/Denver 41.0 48.0
Asia/Tokyo 88.0 14.0
America/Puerto_Rico 93.0 91.0
America/Los_Angeles 207.0 214.0
484.0 152.0
America/Chicago 343.0 343.0
America/New_York 550.0 353.0

In [114]:
count_subset.plot(kind='barh', stacked=True)


Out[114]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f8ee8be78d0>

In [115]:
normed_subset = count_subset.div(count_subset.sum(1), axis=0)

In [116]:
normed_subset.plot(kind='barh', stacked=True)


Out[116]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f8ee8ae7690>

In [ ]: